Comparing genomic offset predictions

Author

Juliette Archambeau

Published

August 17, 2023

We compare the genomic offset predictions from the four following methods:

Code
# Method names
meth_names <- c("GDM","LFMM","GF","RDA")

# population names
pop_names <- readRDS(here("data/GenomicData/MainGenePoolPopulations.rds")) %>%  arrange(pop) %>% pull(pop)

# SNP sets
snp_sets <- readRDS(here("outputs/list_snp_sets.rds"))

Comparing population ranks

In this section, we compare GO predictions from the different methods (GF, GDM, RDA and LFMM), SNP sets (candidates, control and all SNPs for LFMM) and GCMs. For that, for each combination of method/SNP set/GCM, we ranked the populations based on the GO predictions, i.e. populations with the lowest rank had the highest GO predictions.

Across methods/SNP sets

We first compared the population ranks from the different combinations method/SNP set. For that, we generated one plot per GCM. We also generated a plot in which the population ranks are not based on an unique GCM but are based on the average GO predictions across all GCMs. In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one combination method/SNP set.

Code
# Ranking the populations for each combination method/SNP set/GCM
# ===============================================================

go_df <- lapply(meth_names, function(meth_name) {
  
if(meth_name=="LFMM"){
  
  go_predictions <- readRDS(file=here(paste0("outputs/LFMM/go_predictions_snpsets.rds")))} else {
  
  go_predictions <- readRDS(file=here(paste0("outputs/",meth_name,"/go_predictions.rds")))
  }
  
go_predictions <-  go_predictions %>% 
    lapply(function(snp_set) {

    snp_set$go %>% 
        lapply(function(gcm){
          
        tibble(pop = pop_names,
               go = gcm) %>%
            arrange(go) %>% 
            mutate(rank = rev(1:length(pop_names)))
        
      }) %>% bind_rows(.id="gcm")
        
    }) %>% bind_rows(.id="snp_set")

if(meth_name=="LFMM"){
  
go_predictions <-    readRDS(file=here(paste0("outputs/LFMM/go_predictions_allsnps.rds")))$go %>% 
    lapply(function(gcm){
          
        tibble(pop = pop_names,
               go = gcm,
               snp_set = "all") %>%
            arrange(go) %>% 
            mutate(rank = rev(1:length(pop_names)))
        
      }) %>% 
    bind_rows(.id="gcm") %>% 
    bind_rows(go_predictions)
  
  
  } 

go_predictions

}) %>% 
  setNames(meth_names) %>% 
  bind_rows(.id="method")



# ============================================================================================================
# ============================================================================================================

# For each combination of method (GF, GDM, LFMM and RDA) and SNP set (candidates, control and all SNPs for LFMM), 
# we calculate the average of the GO predictions across the five GCMs, 
# and then we rank the populations based on their GO predicted value.


# First option (that we should use I think): take the mean of the GO predictions
go_df <- go_df %>% 
  group_by(pop,method,snp_set) %>% 
  summarise(go=mean(go)) %>% 
  ungroup() %>% 
  group_split(method,snp_set) %>% 
  lapply(function(x){
    x %>% 
      arrange(go) %>% 
      mutate(rank = rev(1:length(pop_names)),
             gcm = "GCMs_average")
  }) %>% 
  bind_rows() %>% 
  bind_rows(go_df)

# Another option (the first I use, but I think we should use the option above): take the mean of the ranks
# go_df %>%
#   group_by(pop,method,snp_set) %>%
#   summarise(mean_rank=mean(rank),go=mean(go)) %>%
#   ungroup() %>%
#   group_split(method,snp_set) %>%
#   lapply(function(x){
#     x %>%
#       arrange(mean_rank) %>%
#       mutate(rank = 1:length(pop_names),
#              gcm = "GCM average") %>%
#       dplyr::select(-mean_rank)
#   }) %>%
#   bind_rows() %>%
#   bind_rows(go_df)


# =====================================
# =====================================

# FIGURE TITLE AND LABELS

# We create a vector that we will use for the x-axis labels and the plot title

method_snpset_names <- c("GDM (candidates)",
                         "GDM (control)",
                         "GF (candidates)",
                         "GF (control)",
                         "LFMM (all SNPs)",
                         "LFMM (candidates)",
                         "LFMM (control)",
                         "RDA (candidates)",
                         "RDA (control)",
                         "Average across methods and SNP sets")

names(method_snpset_names) <- go_df %>% 
  filter(! (snp_set == "cand_corrected")) %>%
  mutate(meth_snpset = paste0(method, "_", snp_set)) %>% 
  pull(meth_snpset) %>% 
  unique() %>% 
  c(.,"method_snpset_average")
Code
# =================================
# Code the generate the bump charts
# =================================


# the colors I will use to color the populations with high GO
my_colors <- c(brewer.pal(n=12, "Paired"),"#FF40EE")

bump_charts <- go_df %>% 
  filter(! (snp_set == "cand_corrected")) %>% 
  mutate(x_axis = method_snpset_names[paste0(method, "_", snp_set)]) %>%
  group_split(gcm) %>% 
  lapply(function(x){
    
high_go_pops <- x %>% 
  filter(rank < 4) %>% 
  pull(pop) %>% 
  unique()    

my_palette <- c(my_colors[1:length(high_go_pops)],"#E8E8E8")

if(unique(x$gcm)=="GCMs_average"){
  plot_title <- "Average across the five GCMs"
} else {
  plot_title <- unique(x$gcm)
}

sub <- x %>% 
  mutate(flag = ifelse(pop %in% high_go_pops, TRUE, FALSE),
         pop_col = if_else(flag == TRUE, pop, "Others")) %>% 
  mutate(pop = factor(pop, levels=c(setdiff(unique(x$pop),high_go_pops),high_go_pops)),
         pop_col = factor(pop_col, levels = c(high_go_pops,"Others")))

p <- sub %>% ggplot(aes(x = x_axis, y = rank, group = pop)) +
  geom_point(aes(color = pop_col), size = 2, alpha = 0.9) +
  geom_line(aes(color = pop_col), linewidth = 2, alpha = 0.8) +
  scale_y_reverse(breaks = 1:nrow(sub)) +
  scale_color_manual(values = my_palette) +
  geom_text(data = sub %>% filter(x_axis == sub$x_axis[[length(sub$x_axis)]] & pop %in% high_go_pops),
            aes(label = pop, x = 9.1) , hjust = 0.15, color = "gray20", size = 3) +#color = "#888888",
  theme_bw() +
  ylab("Population rank") +
  ggtitle(plot_title) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_blank(),
        legend.position = "none",
        axis.title.x = element_blank())

ggsave(p, filename = here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),".pdf")), device = "pdf", width=12)

if(unique(x$gcm)=="GCMs_average"){
  p_manuscript <- p + ggtitle("") 
  ggsave(p_manuscript, 
         filename = here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),"_notitle.pdf")), device = "pdf", width=12)} 

p

  })

pdf(here("figs/PredictionVariability/BumpCharts_PerMethSnpSet.pdf"), width=12)
lapply(bump_charts, function(x) x)
dev.off()

bump_charts

Across GCMs

We then compared the population ranks from the different GCMs (or from the average GO predictions across the five GCMs). For that, we generated one plot per combination method/SNP set. We also generated a plot in which the population ranks are not based on an unique combination method/SNP set but are based on the average GO predictions across all combinations method/SNP set.

In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one GCM.

Code
go_df_gcm <- go_df %>% 
  filter(! (snp_set == "cand_corrected")) %>% 
  mutate(meth_snpset = paste0(method, "_", snp_set)) %>% 
  dplyr::select(-method,-snp_set)
  
  
bump_charts <- go_df_gcm %>% 
  group_by(pop,gcm) %>% 
  summarise(go =mean(go)) %>% 
  ungroup() %>% 
  group_split(gcm) %>% 
  lapply(function(x){
    x %>% 
      arrange(go) %>% 
      mutate(rank = rev(1:length(pop_names)))
  }) %>% 
  bind_rows() %>% 
  mutate(meth_snpset = "method_snpset_average") %>% 
  bind_rows(go_df_gcm) %>% 
  group_split(meth_snpset) %>% 
  lapply(function(x){

high_go_pops <- x %>% 
  filter(rank < 4) %>% 
  pull(pop) %>% 
  unique()    

my_palette <- c(my_colors[1:length(high_go_pops)],"#E8E8E8")

df <- x %>% 
  mutate(flag = ifelse(pop %in% high_go_pops, TRUE, FALSE),
         pop_col = if_else(flag == TRUE, pop, "Others")) %>% 
  mutate(pop = factor(pop, levels=c(setdiff(unique(go_df_gcm$pop),high_go_pops),high_go_pops)),
         pop_col = factor(pop_col, levels = c(high_go_pops,"Others")))


p <- df %>% ggplot(aes(x = gcm, y = rank, group = pop)) +
  geom_point(aes(color = pop_col), size = 2, alpha = 0.9) +
  geom_line(aes(color = pop_col), linewidth = 2, alpha = 0.8) +
  scale_y_reverse(breaks = 1:nrow(go_df_gcm)) +
  scale_color_manual(values = my_palette) +
  geom_text(data = df %>% filter(gcm == "UKESM1-0-LL" & pop %in% high_go_pops),
            aes(label = pop, x = 6.1) , hjust = 0.15, color = "gray20", size = 3) +#color = "#888888",
  theme_bw() +
  ylab("Population rank") +
  ggtitle(paste0(method_snpset_names[unique(x$meth_snpset)])) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_blank(),
        legend.position = "none",
        axis.title.x = element_blank())

ggsave(p, filename = here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$meth_snpset),".pdf")), device = "pdf", width=10)

p
  })



pdf(here("figs/PredictionVariability/BumpCharts_PerGCM.pdf"), width=10)
lapply(bump_charts, function(x) x)
dev.off()

bump_charts

Correlations

Code
go_df %>% 
  filter(! (snp_set == "cand_corrected")) %>% 
  mutate(meth_snpset = method_snpset_names[paste0(method, "_", snp_set)]) %>% 
  group_split(gcm) %>% 
  lapply(function(x){
    
    df_cor <- x %>% 
      pivot_wider(id_cols=pop, values_from = go, names_from = meth_snpset)  %>% 
      dplyr::select(-pop) %>% 
      cor()
    
    
    pdf(here(paste0("figs/PredictionVariability/CorrelationPlot_",paste0(unique(x$gcm)),".pdf")))
    df_cor  %>% corrplot(method = 'number',type = 'lower', diag = FALSE,
             title="",
             mar=c(0,0,0,0),
             number.cex=1,
             tl.cex=1)
    dev.off()
      
      
    df_cor %>% corrplot(method = 'number',type = 'lower', diag = FALSE,
               title=paste0(unique(x$gcm)),mar=c(0,0,2,0),
               number.cex=1,
               tl.cex=1)
  })

Maps

Code
list_highpops <- list(LFMM = readRDS(here("outputs/LFMM/high_go_pops.rds"))[[2]],
                      RDA = readRDS(here("outputs/RDA/high_go_pops.rds"))[[2]],
                      GDM = readRDS(here("outputs/GDM/high_go_pops.rds"))[[2]],
                      GF = readRDS(here("outputs/GF/high_go_pops.rds"))[[2]])

list_highpops <- list(LFMM = readRDS(here("outputs/LFMM/high_go_pops.rds"))[[2]],
                      RDA = readRDS(here("outputs/LFMM/high_go_pops.rds"))[[2]],
                      GDM = readRDS(here("outputs/LFMM/high_go_pops.rds"))[[2]],
                      GF = readRDS(here("outputs/LFMM/high_go_pops.rds"))[[2]])

list_highpops[[1]] <- list_highpops[[1]] + theme(legend.position = "none")
list_highpops[[2]] <- list_highpops[[2]] + theme(legend.position = "none")
list_highpops[[3]] <- list_highpops[[3]] + theme(legend.position = "none")

plot_grid(plotlist = list_highpops, nrow=2)